{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys,os\n",
    "import csv\n",
    "import pickle\n",
    "import scipy\n",
    "import numpy as np\n",
    "import json\n",
    "from IPython.display import clear_output\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Computing Scores\n",
    "\n",
    "Use this nb to compute all the pop. scores for all the parties in each nation. The nb uses all the parties in the test sets, plus all the parties ecluded from the training set in the \"parties_to_exclude\" dictionary from nb \"00_generate_bag_of_words.ipynb\".\n",
    "\n",
    "In the __Configuration__ section, fill the \"nations_params\" dictionary with the kind of classifier used for the corresponding nation, the target score used in the Grid Search and the seed for the random number generators. Check \"training_results.json\" for possible values.\n",
    "\n",
    "\n",
    "The ouput data will be saved in .csv format into the \"scores\" folder. \n",
    "\n",
    "__Note__\n",
    "\n",
    "For Spain we will not compute the score for all the regionalist parties."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "nations_params ={\n",
    "    \"IT\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"AUC\",\n",
    "        \"random_state\":1\n",
    "    },\n",
    "    \"FR\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"AUC\",\n",
    "        \"random_state\":1\n",
    "    },\n",
    "    \"ES\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"AUC\",\n",
    "        \"random_state\":1\n",
    "    },\n",
    "    \"DE\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"AUC\",\n",
    "        \"random_state\":1\n",
    "    },\n",
    "    \"AT\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"AUC\",\n",
    "        \"random_state\":15\n",
    "    },\n",
    "    \"NL\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"AUC\",\n",
    "        \"random_state\":1\n",
    "    },\n",
    "    \"IT_speeches\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"AUC\",\n",
    "        \"random_state\":2\n",
    "    },\n",
    "    \"IT_manual\":{\n",
    "        \"model\":\"RandomForest\",\n",
    "        \"target\": \"F1\",\n",
    "        \"random_state\":3232\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parties data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "parties_to_exclude = {\n",
    "    \"IT\":['Forward Italy', 'PdL', 'Italy of Values', 'Casapound', 'Houses of Freedom'],\n",
    "    \"IT_speeches\":['Forward Italy', 'PdL', 'Italy of Values', 'Casapound', 'Houses of Freedom'],\n",
    "    \"IT_manual\":[],\n",
    "    \"FR\":['The Greens','French Communist Party', \"Nouveau Parti Anticapitaliste\", \"Resistons\",'Debout la France'],\n",
    "    \"AT\":['Peter Pilz List'],\n",
    "    \"NL\":['DENK','Party for the Animals','Reformed Political Party','50Plus','Green Left'],\n",
    "    \"ES\":[\"Citizens\"],\n",
    "    \"DE\":['Pirates']\n",
    "}\n",
    "\n",
    "populist_parties = {\n",
    "    \"IT\":['Northern League', 'PaP', 'M5S', 'Brothers of Italy'],\n",
    "    \"FR\":['National Front','Indomitable France'],\n",
    "    \"AT\":['Austrian Freedom Party','Alliance for the Future of Austria','Team Stronach for Austria'],\n",
    "    \"NL\":['Party of Freedom','List Pim Fortuyn','Socialist Party','Forum for Democracy'],\n",
    "    \"ES\":['We can','In Common We Can',\"Vox\"],\n",
    "    \"DE\":['The Left','Alternative for Germany']\n",
    "    \n",
    "}\n",
    "\n",
    "spanish_regionalist_parties = ['Amaiur',\n",
    "                 'Andalusian Party',\n",
    "                 'Aragonist Council',\n",
    "                 'Basque Country Unite',\n",
    "                 'Basque Nationalist Party',\n",
    "                 'Basque Solidarity',\n",
    "                 'Canarian Coalition',\n",
    "                 'Catalan Republican Left',\n",
    "                 'Citizens',\n",
    "                 'Commitment-Q',\n",
    "                 'Commitment-We can-It is time',\n",
    "                 'Democratic Convergence of Catalonia',\n",
    "                 'Forum Asturias',\n",
    "                 'Future Yes',\n",
    "                 'Galician Nationalist Bloc',\n",
    "                 'In Tide',\n",
    "                 \"Navarrese People's Union\",\n",
    "                 'Valencian style']\n",
    "\n",
    "nations = populist_parties.keys()\n",
    "\n",
    "nations = list(populist_parties.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "reading model for IT...\n",
      "reading test data for IT...\n",
      "computing test data scores for IT...\n",
      "reading excluded parties scores for IT...\n",
      "finding all words for IT excluded parties...\n",
      "generating words indices for IT excluded parties...\n",
      "computing excluded parties scores for IT...\n",
      "saving scores for IT...\n",
      "\n",
      "reading model for FR...\n",
      "reading test data for FR...\n",
      "computing test data scores for FR...\n",
      "reading excluded parties scores for FR...\n",
      "finding all words for FR excluded parties...\n",
      "generating words indices for FR excluded parties...\n",
      "computing excluded parties scores for FR...\n",
      "saving scores for FR...\n",
      "\n",
      "reading model for AT...\n",
      "reading test data for AT...\n",
      "computing test data scores for AT...\n",
      "reading excluded parties scores for AT...\n",
      "finding all words for AT excluded parties...\n",
      "generating words indices for AT excluded parties...\n",
      "computing excluded parties scores for AT...\n",
      "saving scores for AT...\n",
      "\n",
      "reading model for NL...\n",
      "reading test data for NL...\n",
      "computing test data scores for NL...\n",
      "reading excluded parties scores for NL...\n",
      "finding all words for NL excluded parties...\n",
      "generating words indices for NL excluded parties...\n",
      "computing excluded parties scores for NL...\n",
      "saving scores for NL...\n",
      "\n",
      "reading model for ES...\n",
      "reading test data for ES...\n",
      "computing test data scores for ES...\n",
      "reading excluded parties scores for ES...\n",
      "finding all words for ES excluded parties...\n",
      "generating words indices for ES excluded parties...\n",
      "computing excluded parties scores for ES...\n",
      "saving scores for ES...\n",
      "\n",
      "reading model for DE...\n",
      "reading test data for DE...\n",
      "computing test data scores for DE...\n",
      "reading excluded parties scores for DE...\n",
      "finding all words for DE excluded parties...\n",
      "generating words indices for DE excluded parties...\n",
      "computing excluded parties scores for DE...\n",
      "saving scores for DE...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for nation in nations:\n",
    "        \n",
    "    print(\"reading model for {}...\".format(nation))\n",
    "    model_type, target, random_state = nations_params[nation].values()\n",
    "\n",
    "    params = pickle.load(open(\"./models/{0}_{1}_{2}_{3}_best_model_params.pkl\".format(nation, model_type, target, random_state),'rb'))\n",
    "    model = pickle.load(open(\"./models/{0}_{1}_{2}_{3}_best_model.pkl\".format(nation, model_type, target, random_state),\"rb\"))\n",
    "    indexes_test = pickle.load(open(\"./models/{0}_{1}_{2}_{3}_test_indexes.pkl\".format(nation, model_type, target, random_state),'rb'))\n",
    "    max_thresh = params[\"threshold\"]\n",
    "    \n",
    "    print(\"reading test data for {}...\".format(nation))\n",
    "    X = pickle.load(open(\"./bow_and_labels/X_{}_sentences.pkl\".format(nation), \"rb\"))[indexes_test]\n",
    "    Y = pickle.load(open(\"./bow_and_labels/Y_{}_sentences.pkl\".format(nation), \"rb\"))[indexes_test]\n",
    "    parties = pickle.load(open(\"./bow_and_labels/parties_{}_sentences.pkl\".format(nation), \"rb\"))[indexes_test]\n",
    "    years = pickle.load(open(\"./bow_and_labels/years_{}_sentences.pkl\".format(nation), \"rb\"))[indexes_test]\n",
    "    \n",
    "    \n",
    "    print(\"computing test data scores for {}...\".format(nation))    \n",
    "    s = (model.predict_proba(X)[:,1]>max_thresh)\n",
    "    global_scores = {}\n",
    "    global_scores_counts = {}\n",
    "    score_in_time = {}\n",
    "    score_in_time_counts = {}\n",
    "\n",
    "    for party in set(parties):\n",
    "        iii = np.where(parties==party)[0]\n",
    "        global_scores[party] = np.mean(s[iii])\n",
    "        global_scores_counts[party] = len(s[iii])\n",
    "\n",
    "    for party, year in zip(parties, years):\n",
    "        iii = np.where((parties==party) & (years==year))[0]\n",
    "        score_in_time[(party, year)] = np.mean(s[iii])\n",
    "        score_in_time_counts[(party, year)] = len(s[iii])\n",
    "    \n",
    "    \n",
    "    print(\"reading excluded parties scores for {}...\".format(nation))    \n",
    "\n",
    "    if \"speeches\" in nation or \"manual\" in nation:\n",
    "        data = json.load(open(\"./datasets/{}_sentences.json\".format(nation),\"r\"))\n",
    "    else:\n",
    "        data = json.load(open(\"./datasets/{}_manifesto_sentences.json\".format(nation),\"r\"))\n",
    "\n",
    "    ###add party orientation\n",
    "    party_orientation = {}\n",
    "    for record in data:\n",
    "        party = record[\"party\"]\n",
    "        orientation = record[\"orientation\"]\n",
    "        party_orientation[party] = orientation\n",
    "        \n",
    "    print(\"finding all words for {} excluded parties...\".format(nation))\n",
    "    \n",
    "    if nation!=\"ES\": excluded_parties = parties_to_exclude[nation]\n",
    "    else: excluded_parties = parties_to_exclude[nation] + spanish_regionalist_parties\n",
    "    \n",
    "    if len(excluded_parties)!=0:\n",
    "        counts = {}\n",
    "        N_sentences = 0\n",
    "        for record in data:\n",
    "            clean_text = record[\"clean_text\"]   \n",
    "\n",
    "            if record[\"party\"] in excluded_parties:\n",
    "                N_sentences +=1\n",
    "                continue\n",
    "\n",
    "            for word in clean_text:\n",
    "                try: counts[word]+=1\n",
    "                except KeyError: counts[word]=1\n",
    "                    \n",
    "        print(\"generating words indices for {} excluded parties...\".format(nation))\n",
    "\n",
    "        to_del= [word for word in counts if counts[word]<=4 or len(word)<=2]\n",
    "        for word in to_del: \n",
    "            del counts[word]\n",
    "\n",
    "        words_list = [w for w in counts.keys()]\n",
    "        word_index = {}\n",
    "        for w in words_list: word_index[w] = len(word_index)\n",
    "        N = len(word_index)\n",
    "\n",
    "\n",
    "        X_excluded = np.zeros((N_sentences,N))\n",
    "        parties_excluded= []\n",
    "        years_excluded = []\n",
    "\n",
    "        i=0\n",
    "        for record in data:\n",
    "            clean_text = record[\"clean_text\"]\n",
    "            party = record[\"party\"]\n",
    "            year = record[\"year\"]\n",
    "            if party not in parties_to_exclude[nation]:continue\n",
    "\n",
    "            for w in clean_text:\n",
    "                try: j = word_index[w]\n",
    "                except KeyError: continue\n",
    "                X_excluded[i,j] = 1\n",
    "\n",
    "            parties_excluded.append(party)\n",
    "            years_excluded.append(year)\n",
    "            i+=1\n",
    "        parties_excluded = np.array(parties_excluded)\n",
    "        years_excluded = np.array(years_excluded)\n",
    "\n",
    "        print(\"computing excluded parties scores for {}...\".format(nation))    \n",
    "\n",
    "\n",
    "        s_excluded = (model.predict_proba(X_excluded)[:,1]>max_thresh)\n",
    "\n",
    "\n",
    "        for party in set(parties_excluded):\n",
    "            iii = np.where(parties_excluded==party)[0]\n",
    "            global_scores[party] = np.mean(s_excluded[iii])\n",
    "            global_scores_counts[party] = len(s_excluded[iii])\n",
    "\n",
    "        for party, year in zip(parties_excluded, years_excluded):\n",
    "            iii = np.where((parties_excluded==party) & (years_excluded==year))[0]\n",
    "            score_in_time[(party, year)] = np.mean(s_excluded[iii])\n",
    "            score_in_time_counts[(party, year)] = len(s_excluded[iii])\n",
    "\n",
    "    else:\n",
    "        print(\"no excluded parties, skipping..\")\n",
    "\n",
    "    print(\"saving scores for {}...\".format(nation))    \n",
    "\n",
    "    global_scores_df = pd.DataFrame({\"party\":global_scores.keys(),\"score\":global_scores.values()})\n",
    "    global_scores_df[\"orientation\"] = [party_orientation[party] for party in global_scores_df.party]\n",
    "    global_scores_df[\"counts\"] = [global_scores_counts[party] for party in global_scores_df.party]\n",
    "\n",
    "    global_scores_df.to_csv(\"./scores/global_scores_{}.csv\".format(nation), index=False)\n",
    "\n",
    "    score_in_time_df = pd.DataFrame({\"party\":[k[0] for k in score_in_time.keys()],\"year\":[k[1] for k in score_in_time.keys()],\"score\":score_in_time.values()})\n",
    "    score_in_time_df[\"orientation\"] = [party_orientation[party] for party in score_in_time_df.party]\n",
    "    score_in_time_df[\"counts\"] = [score_in_time_counts[(party,year)] for party,year in score_in_time_df[[\"party\", \"year\"]].values]\n",
    "\n",
    "    score_in_time_df.to_csv(\"./scores/scores_in_time_{}.csv\".format(nation), index=False)\n",
    "    \n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
